{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train\n",
    "# !wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse(file):\n",
    "    with open(file) as fopen:\n",
    "        texts = fopen.read().split('\\n')\n",
    "    left, right = [], []\n",
    "    for text in texts:\n",
    "        if '-DOCSTART-' in text or not len(text):\n",
    "            continue\n",
    "        splitted = text.split()\n",
    "        left.append(splitted[0])\n",
    "        right.append(splitted[-1])\n",
    "    return left, right"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "left_train, right_train = parse('eng.train')\n",
    "left_test, right_test = parse('eng.testa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_string(string):\n",
    "    string = re.sub('[^A-Za-z0-9\\-\\/ ]+', ' ', string).split()\n",
    "    return ' '.join([to_title(y.strip()) for y in string])\n",
    "\n",
    "def to_title(string):\n",
    "    if string.isupper():\n",
    "        string = string.title()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER',\n",
       "        'O'], dtype='<U6'),\n",
       " array([    11,     37,     24,   8286,   4556,  10001,  11128, 169578]))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(right_train,return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2idx = {'PAD': 0,'NUM':1,'UNK':2}\n",
    "tag2idx = {'PAD': 0}\n",
    "char2idx = {'PAD': 0}\n",
    "word_idx = 3\n",
    "tag_idx = 1\n",
    "char_idx = 1\n",
    "\n",
    "def parse_XY(texts, labels):\n",
    "    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx\n",
    "    X, Y = [], []\n",
    "    for no, text in enumerate(texts):\n",
    "        text = text.lower()\n",
    "        tag = labels[no]\n",
    "        for c in text:\n",
    "            if c not in char2idx:\n",
    "                char2idx[c] = char_idx\n",
    "                char_idx += 1\n",
    "        if tag not in tag2idx:\n",
    "            tag2idx[tag] = tag_idx\n",
    "            tag_idx += 1\n",
    "        Y.append(tag2idx[tag])\n",
    "        if text not in word2idx:\n",
    "            word2idx[text] = word_idx\n",
    "            word_idx += 1\n",
    "        X.append(word2idx[text])\n",
    "    return X, np.array(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, train_Y = parse_XY(left_train, right_train)\n",
    "test_X, test_Y = parse_XY(left_test, right_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "idx2word = {idx: tag for tag, idx in word2idx.items()}\n",
    "idx2tag = {i: w for w, i in tag2idx.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq_len = 50\n",
    "def iter_seq(x):\n",
    "    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])\n",
    "\n",
    "def to_train_seq(*args):\n",
    "    return [iter_seq(x) for x in args]\n",
    "\n",
    "def generate_char_seq(batch):\n",
    "    x = [[len(idx2word[i]) for i in k] for k in batch]\n",
    "    maxlen = max([j for i in x for j in i])\n",
    "    temp = np.zeros((batch.shape[0],batch.shape[1],maxlen),dtype=np.int32)\n",
    "    for i in range(batch.shape[0]):\n",
    "        for k in range(batch.shape[1]):\n",
    "            for no, c in enumerate(idx2word[batch[i,k]]):\n",
    "                temp[i,k,-1-no] = char2idx[c]\n",
    "    return temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(203571, 50)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_seq, Y_seq = to_train_seq(train_X, train_Y)\n",
    "X_char_seq = generate_char_seq(X_seq)\n",
    "X_seq.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(51312, 50)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_seq_test, Y_seq_test = to_train_seq(test_X, test_Y)\n",
    "X_char_seq_test = generate_char_seq(X_seq_test)\n",
    "X_seq_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, train_Y, train_char = X_seq, Y_seq, X_char_seq\n",
    "test_X, test_Y, test_char = X_seq_test, Y_seq_test, X_char_seq_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    def __init__(\n",
    "        self,\n",
    "        dim_word,\n",
    "        dim_char,\n",
    "        dropout,\n",
    "        learning_rate,\n",
    "        hidden_size_char,\n",
    "        hidden_size_word,\n",
    "        num_layers,\n",
    "    ):\n",
    "        def cells(size, reuse = False):\n",
    "            return tf.contrib.rnn.DropoutWrapper(\n",
    "                tf.nn.rnn_cell.LSTMCell(\n",
    "                    size,\n",
    "                    initializer = tf.orthogonal_initializer(),\n",
    "                    reuse = reuse,\n",
    "                ),\n",
    "                output_keep_prob = dropout,\n",
    "            )\n",
    "\n",
    "        def luong(embedded, size):\n",
    "            attention_mechanism = tf.contrib.seq2seq.LuongAttention(\n",
    "                num_units = hidden_size_word, memory = embedded\n",
    "            )\n",
    "            return tf.contrib.seq2seq.AttentionWrapper(\n",
    "                cell = cells(hidden_size_word),\n",
    "                attention_mechanism = attention_mechanism,\n",
    "                attention_layer_size = hidden_size_word,\n",
    "            )\n",
    "        self.word_ids = tf.placeholder(tf.int32, shape = [None, None])\n",
    "        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None])\n",
    "        self.labels = tf.placeholder(tf.int32, shape = [None, None])\n",
    "        self.maxlen = tf.shape(self.word_ids)[1]\n",
    "        self.lengths = tf.count_nonzero(self.word_ids, 1)\n",
    "\n",
    "        self.word_embeddings = tf.Variable(\n",
    "            tf.truncated_normal(\n",
    "                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)\n",
    "            )\n",
    "        )\n",
    "        self.char_embeddings = tf.Variable(\n",
    "            tf.truncated_normal(\n",
    "                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)\n",
    "            )\n",
    "        )\n",
    "\n",
    "        word_embedded = tf.nn.embedding_lookup(\n",
    "            self.word_embeddings, self.word_ids\n",
    "        )\n",
    "        char_embedded = tf.nn.embedding_lookup(\n",
    "            self.char_embeddings, self.char_ids\n",
    "        )\n",
    "        s = tf.shape(char_embedded)\n",
    "        char_embedded = tf.reshape(\n",
    "            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]\n",
    "        )\n",
    "        for n in range(num_layers):\n",
    "            (out_fw, out_bw), (\n",
    "                state_fw,\n",
    "                state_bw,\n",
    "            ) = tf.nn.bidirectional_dynamic_rnn(\n",
    "                cell_fw = cells(hidden_size_char),\n",
    "                cell_bw = cells(hidden_size_char),\n",
    "                inputs = char_embedded,\n",
    "                dtype = tf.float32,\n",
    "                scope = 'bidirectional_rnn_char_%d' % (n),\n",
    "            )\n",
    "            char_embedded = tf.concat((out_fw, out_bw), 2)\n",
    "        output = tf.reshape(\n",
    "            char_embedded[:, -1], shape = [s[0], s[1], 2 * hidden_size_char]\n",
    "        )\n",
    "        word_embedded = tf.concat([word_embedded, output], axis = -1)\n",
    "\n",
    "        for n in range(num_layers):\n",
    "            (out_fw, out_bw), (\n",
    "                state_fw,\n",
    "                state_bw,\n",
    "            ) = tf.nn.bidirectional_dynamic_rnn(\n",
    "                cell_fw = luong(word_embedded, hidden_size_word),\n",
    "                cell_bw = luong(word_embedded, hidden_size_word),\n",
    "                inputs = word_embedded,\n",
    "                dtype = tf.float32,\n",
    "                scope = 'bidirectional_rnn_word_%d' % (n),\n",
    "            )\n",
    "            word_embedded = tf.concat((out_fw, out_bw), 2)\n",
    "        logits = tf.layers.dense(word_embedded, len(idx2tag))\n",
    "        y_t = self.labels\n",
    "        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(\n",
    "            logits, y_t, self.lengths\n",
    "        )\n",
    "        self.cost = tf.reduce_mean(-log_likelihood)\n",
    "        self.optimizer = tf.train.AdamOptimizer(\n",
    "            learning_rate = learning_rate\n",
    "        ).minimize(self.cost)\n",
    "        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)\n",
    "        self.tags_seq, tags_score = tf.contrib.crf.crf_decode(\n",
    "            logits, transition_params, self.lengths\n",
    "        )\n",
    "        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')\n",
    "\n",
    "        y_t = tf.cast(y_t, tf.int32)\n",
    "        self.prediction = tf.boolean_mask(self.tags_seq, mask)\n",
    "        mask_label = tf.boolean_mask(y_t, mask)\n",
    "        correct_pred = tf.equal(self.prediction, mask_label)\n",
    "        correct_index = tf.cast(correct_pred, tf.float32)\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Colocations handled automatically by placer.\n",
      "\n",
      "WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
      "For more information, please see:\n",
      "  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
      "  * https://github.com/tensorflow/addons\n",
      "If you depend on functionality not listed there, please file an issue.\n",
      "\n",
      "WARNING:tensorflow:From <ipython-input-14-4e9d587a5852>:17: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.\n",
      "WARNING:tensorflow:From <ipython-input-14-4e9d587a5852>:67: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/rnn.py:443: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/rnn_cell_impl.py:1259: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n",
      "WARNING:tensorflow:From <ipython-input-14-4e9d587a5852>:87: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use keras.layers.dense instead.\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/rnn.py:626: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use tf.cast instead.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:110: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "\n",
    "dim_word = 64\n",
    "dim_char = 128\n",
    "dropout = 0.8\n",
    "learning_rate = 1e-3\n",
    "hidden_size_char = 128\n",
    "hidden_size_word = 128\n",
    "num_layers = 2\n",
    "batch_size = 128\n",
    "\n",
    "model = Model(dim_word,dim_char,dropout,learning_rate,\n",
    "              hidden_size_char,hidden_size_word,num_layers)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 1591/1591 [18:28<00:00,  1.56it/s, accuracy=0.73, cost=35.4] \n",
      "test minibatch loop: 100%|██████████| 401/401 [01:36<00:00,  4.18it/s, accuracy=0.883, cost=20.8]\n",
      "train minibatch loop:   0%|          | 0/1591 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 1204.4710218906403\n",
      "epoch: 0, training loss: 22.680630, training acc: 0.868153, valid loss: 28.759542, valid acc: 0.809839\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 1591/1591 [18:14<00:00,  1.57it/s, accuracy=0.952, cost=5.96] \n",
      "test minibatch loop: 100%|██████████| 401/401 [01:34<00:00,  4.28it/s, accuracy=0.905, cost=9.9] \n",
      "train minibatch loop:   0%|          | 0/1591 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 1188.9075050354004\n",
      "epoch: 1, training loss: 15.223893, training acc: 0.889726, valid loss: 12.128838, valid acc: 0.906151\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 1591/1591 [18:11<00:00,  1.59it/s, accuracy=0.987, cost=2.23] \n",
      "test minibatch loop: 100%|██████████| 401/401 [01:34<00:00,  4.26it/s, accuracy=0.906, cost=8.09]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 1186.5152912139893\n",
      "epoch: 2, training loss: 8.237849, training acc: 0.935369, valid loss: 9.531218, valid acc: 0.932109\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "for e in range(3):\n",
    "    lasttime = time.time()\n",
    "    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0\n",
    "    pbar = tqdm(\n",
    "        range(0, len(train_X), batch_size), desc = 'train minibatch loop'\n",
    "    )\n",
    "    for i in pbar:\n",
    "        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]\n",
    "        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]\n",
    "        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]\n",
    "        acc, cost, _ = sess.run(\n",
    "            [model.accuracy, model.cost, model.optimizer],\n",
    "            feed_dict = {\n",
    "                model.word_ids: batch_x,\n",
    "                model.char_ids: batch_char,\n",
    "                model.labels: batch_y\n",
    "            },\n",
    "        )\n",
    "        assert not np.isnan(cost)\n",
    "        train_loss += cost\n",
    "        train_acc += acc\n",
    "        pbar.set_postfix(cost = cost, accuracy = acc)\n",
    "    pbar = tqdm(\n",
    "        range(0, len(test_X), batch_size), desc = 'test minibatch loop'\n",
    "    )\n",
    "    for i in pbar:\n",
    "        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]\n",
    "        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]\n",
    "        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]\n",
    "        acc, cost = sess.run(\n",
    "            [model.accuracy, model.cost],\n",
    "            feed_dict = {\n",
    "                model.word_ids: batch_x,\n",
    "                model.char_ids: batch_char,\n",
    "                model.labels: batch_y\n",
    "            },\n",
    "        )\n",
    "        assert not np.isnan(cost)\n",
    "        test_loss += cost\n",
    "        test_acc += acc\n",
    "        pbar.set_postfix(cost = cost, accuracy = acc)\n",
    "    \n",
    "    train_loss /= len(train_X) / batch_size\n",
    "    train_acc /= len(train_X) / batch_size\n",
    "    test_loss /= len(test_X) / batch_size\n",
    "    test_acc /= len(test_X) / batch_size\n",
    "\n",
    "    print('time taken:', time.time() - lasttime)\n",
    "    print(\n",
    "        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'\n",
    "        % (e, train_loss, train_acc, test_loss, test_acc)\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pred2label(pred):\n",
    "    out = []\n",
    "    for pred_i in pred:\n",
    "        out_i = []\n",
    "        for p in pred_i:\n",
    "            out_i.append(idx2tag[p])\n",
    "        out.append(out_i)\n",
    "    return out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "validation minibatch loop: 100%|██████████| 401/401 [01:36<00:00,  4.23it/s]\n"
     ]
    }
   ],
   "source": [
    "real_Y, predict_Y = [], []\n",
    "\n",
    "pbar = tqdm(\n",
    "    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'\n",
    ")\n",
    "for i in pbar:\n",
    "    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]\n",
    "    batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]\n",
    "    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]\n",
    "    predicted = pred2label(sess.run(model.tags_seq,\n",
    "            feed_dict = {\n",
    "                model.word_ids: batch_x,\n",
    "                model.char_ids: batch_char,\n",
    "            },\n",
    "    ))\n",
    "    real = pred2label(batch_y)\n",
    "    predict_Y.extend(predicted)\n",
    "    real_Y.extend(real)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "     B-MISC       0.00      0.00      0.00       200\n",
      "      I-LOC       0.80      0.78      0.79    104662\n",
      "     I-MISC       0.78      0.42      0.54     63129\n",
      "      I-ORG       0.66      0.36      0.46    104387\n",
      "      I-PER       0.69      0.90      0.78    157385\n",
      "          O       0.97      0.99      0.98   2135837\n",
      "\n",
      "avg / total       0.93      0.93      0.93   2565600\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/jupyter/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(np.array(real_Y).ravel(), np.array(predict_Y).ravel()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
